//KD tree node structure test
// - mirrors kdnode definition found in GPULayout.h
struct kdnode{
    float2 p;
    float d;
    float w;
    uint s;
    uint t;
    int l;
    int r;
};

float2 rep(float eta, float C, float w1, float w2, float2 r){
    return max(w1 * w2, 1.f) * (C / pow(dot(r,r) + eta, 0.18f)) * r;
}
//BACKUP   return max(w1 * w2, 1.f) * (C / pow(dot(r,r) + eta, 1.0f)) * r;
// F             to r^(-?):r^(0.5/2) = r^(0.25) ...... try 0310 1130 CHRIS backup
// F             to r^(-0.25):r^(1.25/2) = r^(0.625) ...... 
// F             to r^(-0.5):r^(1.5/2) = r^(0.75) ...... try 0310 1130
// F             to r^(-1) : r^(2/2) = r^(1.0) ..... try 0310 1130: seems to work! yay
// F corresponds to r^(-2) : r^(3/2) = r^(1.5) ..... DEFAULT
// F corresponds to r^(-3) : r^(4/2) = r^(2.0)
// F                r^(-4) : r^(5/2) = r^(2.5)

float2 edgeForce(int n,
                 float2 p,
                 __global float2 *nodep,
                 __global int *CSRN,
                 __global int *CSRE,
                 __global float *edgew,
                 float eta,
                 float desLength)
{
    int s = CSRN[n];
    int t = CSRN[n+1];

    float2 f = (float2)(0.f, 0.f);
    float2 r = 1.f;
    float d = 1.f;
    float w = 1.f;

    for(int e = s; e < t; e++){
        r = p - nodep[CSRE[e]];
        d = length(r);
        w = edgew[e];
        f -= max(w, 1.f) * d * log((d+eta) / desLength) * r ;
    }
//        f -= max(w, 1.f) * d * log((d+eta) / desLength) * r ;
    return f;
}


float2 interNodeRep(uint l, float eta, float C, __global struct kdnode *kd,
                    __local float2 *p, __local float *w)
{
    uint lID = get_local_id(0);
    int k = kd[k].l;
    float2 f = (float2)(0.f, 0.f);
    float2 r = (float2)(0.f, 0.f);
    float rl = 0.f;

    while(k >= 0){
        if(k == l){ k = kd[k].r; continue; } //ignore own leaf

        r = p[lID] - kd[k].p;
        rl = length(r);

        if(kd[k].d!=0 && rl!=0 && kd[k].d/rl > 1.2f && kd[k].l >= 0){ //descend
            k = kd[k].l;
        }else{ //add contribution and go to neighbor
            f = f + rep(eta, C, w[lID], kd[k].w, r);
            k = kd[k].r;
        }
    }

    return f;
}

float2 intraNodeRep(float eta, float C, unsigned int ns,
                    __local float2 *p, __local float *w)
{
    uint lID = get_local_id(0);
    float2 f = (float2)(0.f, 0.f);
    float2 r = (float2)(0.f, 0.f);
    uint i;

    for(i = 0; i < ns; i++){
        r = p[lID] - p[i];
        f = f + rep(eta, C, w[lID], w[i], r);
    }

    return f;
}




float2 applyForce(float2 p, float2 f, float T)
{
    if(any(isnotequal(f, (float2)(0.f, 0.f)))){
        return (p + fmin(length(f), T)*normalize(f));
    }else return p;
}


__kernel void stepKern(
          /* 0  */ __global float2 *nodep,
          /* 1  */ __global int *CSRN,
          /* 2  */ __global int *CSRE,
          /* 3  */ __global float *nodew,
          /* 4  */ __global float *edgew,
          /* 5  */ __global struct kdnode *kdnodes,
          /* 6  */ uint KDOffset, //index of first leaf node
          /* 7  */ float eta, //softening factor to eliminate divide by 0
          /* 8  */ float C, //repulsive constant
          /* 9  */ float desLength, //desired spring length
          /* 10 */ float T, //temperature
          /* 11 */ __local float2 *p,
          /* 12 */ __local float *w)
{
    int kID = KDOffset + get_group_id(0);
    struct kdnode k = kdnodes[kID];
    int n = k.s + get_local_id(0);
    uint lID = get_local_id(0);

    barrier(CLK_LOCAL_MEM_FENCE); //sync to coalesce memory access
    p[lID] = nodep[n];
    barrier(CLK_LOCAL_MEM_FENCE);
    w[lID] = nodew[n];

    if( n < k.t){
        float2 f = (float2)(0.f, 0.f);

        f += interNodeRep(kID, eta, C, kdnodes, p, w);
        f += intraNodeRep(eta, C, k.t-k.s, p, w);
        f += edgeForce(n, p[lID], nodep, CSRN, CSRE, edgew, eta, desLength);

        nodep[n] = applyForce(p[lID], f, T);
    }
}


float2 timestepForce(int n,
                     float2 p,
                     __global float2 *ptm1,
                     __global int *tmap,
                     float eta,
                     float corrConst)
{
    int i0 = tmap[n];
    if(i0 >= 0){
        float2 p0 = ptm1[i0];
        float2 r = p0 - p;
                return corrConst * r * pow(length(r),.5); // 0322 yuhsuan try
//        return corrConst * r * pow(length(r),.25); //chris
//        return corrConst * r * length(r);
        //return (p0 - p);
    }else{
        //f = f + rep(eta, C, w[lID], w[i], r);
        return (float2)(0.f);
    }
}


__kernel void timestepKern(
          /* 0  */ __global float2 *nodep,
          /* 1  */ __global int *CSRN,
          /* 2  */ __global int *CSRE,
          /* 3  */ __global float *nodew,
          /* 4  */ __global float *edgew,
          /* 5  */ __global struct kdnode *kdnodes,
          /* 6  */ uint KDOffset, //index of first leaf node
          /* 7  */ float eta, //softening factor to eliminate divide by 0
          /* 8  */ float C, //repulsive constant
          /* 9  */ float desLength, //desired spring length
          /* 10 */ float T, //temperature
          /* 11 */ __local float2 *p,
          /* 12 */ __local float *w,
          /* 13 */ float corrConst, //inter-timestep force constant
          /* 14 */ __global float2 *nodetm1,
          /* 15 */ __global int *tmap)
{
    int kID = KDOffset + get_group_id(0);
    struct kdnode k = kdnodes[kID];
    int n = k.s + get_local_id(0);
    uint lID = get_local_id(0);

    barrier(CLK_LOCAL_MEM_FENCE); //sync to coalesce memory access
    p[lID] = nodep[n];
    barrier(CLK_LOCAL_MEM_FENCE);
    w[lID] = nodew[n];

    if( n < k.t){
        float2 f = (float2)(0.f, 0.f);

        f += interNodeRep(kID, eta, C, kdnodes, p, w);
        f += intraNodeRep(eta, C, k.t-k.s, p, w);
        f += edgeForce(n, p[lID], nodep, CSRN, CSRE, edgew, eta, desLength);

        f += timestepForce(n, p[lID], nodetm1, tmap, eta, corrConst);

        nodep[n] = applyForce(p[lID], f, T);
    }
}
